Unemployment in India¶
In [6]:
#import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import calendar
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
In [7]:
df = pd.read_csv("Unemployment in india.csv")
df.head()
Out[7]:
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Area | |
|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-05-2019 | Monthly | 3.65 | 11999139.0 | 43.24 | Rural |
| 1 | Andhra Pradesh | 30-06-2019 | Monthly | 3.05 | 11755881.0 | 42.05 | Rural |
| 2 | Andhra Pradesh | 31-07-2019 | Monthly | 3.75 | 12086707.0 | 43.50 | Rural |
| 3 | Andhra Pradesh | 31-08-2019 | Monthly | 3.32 | 12285693.0 | 43.97 | Rural |
| 4 | Andhra Pradesh | 30-09-2019 | Monthly | 5.17 | 12256762.0 | 44.68 | Rural |
In [8]:
df.tail()
Out[8]:
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Area | |
|---|---|---|---|---|---|---|---|
| 763 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 764 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 765 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 766 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 767 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 768 entries, 0 to 767 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Region 740 non-null object 1 Date 740 non-null object 2 Frequency 740 non-null object 3 Estimated Unemployment Rate (%) 740 non-null float64 4 Estimated Employed 740 non-null float64 5 Estimated Labour Participation Rate (%) 740 non-null float64 6 Area 740 non-null object dtypes: float64(3), object(4) memory usage: 42.1+ KB
In [10]:
df.columns = ['region','date','frequency','estimated unemployment rate','estimated employed','estimated labour participation rate','area']
df.head()
Out[10]:
| region | date | frequency | estimated unemployment rate | estimated employed | estimated labour participation rate | area | |
|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-05-2019 | Monthly | 3.65 | 11999139.0 | 43.24 | Rural |
| 1 | Andhra Pradesh | 30-06-2019 | Monthly | 3.05 | 11755881.0 | 42.05 | Rural |
| 2 | Andhra Pradesh | 31-07-2019 | Monthly | 3.75 | 12086707.0 | 43.50 | Rural |
| 3 | Andhra Pradesh | 31-08-2019 | Monthly | 3.32 | 12285693.0 | 43.97 | Rural |
| 4 | Andhra Pradesh | 30-09-2019 | Monthly | 5.17 | 12256762.0 | 44.68 | Rural |
In [11]:
df.shape
Out[11]:
(768, 7)
In [12]:
df.columns
Out[12]:
Index(['region', 'date', 'frequency', 'estimated unemployment rate',
'estimated employed', 'estimated labour participation rate', 'area'],
dtype='object')
In [13]:
df.describe()
Out[13]:
| estimated unemployment rate | estimated employed | estimated labour participation rate | |
|---|---|---|---|
| count | 740.000000 | 7.400000e+02 | 740.000000 |
| mean | 11.787946 | 7.204460e+06 | 42.630122 |
| std | 10.721298 | 8.087988e+06 | 8.111094 |
| min | 0.000000 | 4.942000e+04 | 13.330000 |
| 25% | 4.657500 | 1.190404e+06 | 38.062500 |
| 50% | 8.350000 | 4.744178e+06 | 41.160000 |
| 75% | 15.887500 | 1.127549e+07 | 45.505000 |
| max | 76.740000 | 4.577751e+07 | 72.570000 |
In [14]:
df.isnull().sum()
Out[14]:
region 28 date 28 frequency 28 estimated unemployment rate 28 estimated employed 28 estimated labour participation rate 28 area 28 dtype: int64
In [15]:
df.duplicated().any()
Out[15]:
True
In [16]:
df.area.value_counts()
Out[16]:
area Urban 381 Rural 359 Name: count, dtype: int64
In [17]:
df['Date'] = pd.to_datetime(df['date'],dayfirst = True)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 768 entries, 0 to 767 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 region 740 non-null object 1 date 740 non-null object 2 frequency 740 non-null object 3 estimated unemployment rate 740 non-null float64 4 estimated employed 740 non-null float64 5 estimated labour participation rate 740 non-null float64 6 area 740 non-null object 7 Date 740 non-null datetime64[ns] dtypes: datetime64[ns](1), float64(3), object(4) memory usage: 48.1+ KB
In [18]:
df['month_int'] = df['Date'].dt.month
df.head()
Out[18]:
| region | date | frequency | estimated unemployment rate | estimated employed | estimated labour participation rate | area | Date | month_int | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-05-2019 | Monthly | 3.65 | 11999139.0 | 43.24 | Rural | 2019-05-31 | 5.0 |
| 1 | Andhra Pradesh | 30-06-2019 | Monthly | 3.05 | 11755881.0 | 42.05 | Rural | 2019-06-30 | 6.0 |
| 2 | Andhra Pradesh | 31-07-2019 | Monthly | 3.75 | 12086707.0 | 43.50 | Rural | 2019-07-31 | 7.0 |
| 3 | Andhra Pradesh | 31-08-2019 | Monthly | 3.32 | 12285693.0 | 43.97 | Rural | 2019-08-31 | 8.0 |
| 4 | Andhra Pradesh | 30-09-2019 | Monthly | 5.17 | 12256762.0 | 44.68 | Rural | 2019-09-30 | 9.0 |
In [19]:
df = df.dropna()
df['month'] = df['month_int'].astype(int).apply(lambda x: calendar.month_abbr[x])
df.head()
Out[19]:
| region | date | frequency | estimated unemployment rate | estimated employed | estimated labour participation rate | area | Date | month_int | month | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-05-2019 | Monthly | 3.65 | 11999139.0 | 43.24 | Rural | 2019-05-31 | 5.0 | May |
| 1 | Andhra Pradesh | 30-06-2019 | Monthly | 3.05 | 11755881.0 | 42.05 | Rural | 2019-06-30 | 6.0 | Jun |
| 2 | Andhra Pradesh | 31-07-2019 | Monthly | 3.75 | 12086707.0 | 43.50 | Rural | 2019-07-31 | 7.0 | Jul |
| 3 | Andhra Pradesh | 31-08-2019 | Monthly | 3.32 | 12285693.0 | 43.97 | Rural | 2019-08-31 | 8.0 | Aug |
| 4 | Andhra Pradesh | 30-09-2019 | Monthly | 5.17 | 12256762.0 | 44.68 | Rural | 2019-09-30 | 9.0 | Sep |
In [20]:
data = df.groupby(['month'])[['estimated unemployment rate','estimated employed','estimated labour participation rate']].mean()
data=pd.DataFrame(data).reset_index()
In [21]:
month = data.month
unemployment_rate = data['estimated unemployment rate']
labour_participation_rate = data['estimated labour participation rate']
fig = go.Figure()
fig.add_trace(go.Bar(x = month,y = unemployment_rate,name = 'Unemployment Rate'))
fig.add_trace(go.Bar(x = month,y = labour_participation_rate,name = 'Labour Participation Rate'))
fig.update_layout(title = 'Unemployment Rate and Labour Participation',xaxis = {'categoryorder':'array','categoryarray':['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct']} )
In [22]:
fig.show()
In [23]:
import plotly.express as px
In [24]:
fig = px.bar(data,x='month',y='estimated employed',color='month',category_orders ={'month':['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct']},
title='Estimated employed people from Jan 2020 to Oct 2020')
fig.show()
In [25]:
area = df.groupby(['area'])[['estimated unemployment rate','estimated employed','estimated labour participation rate']].mean()
area = pd.DataFrame(area).reset_index()
In [26]:
# Box plot
fig = px.box(data_frame=df,x='area',y='estimated unemployment rate',color='area',title='Unemployment rate')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()
In [27]:
# average unemployment rate bar plot
fig = px.bar(area,x='area',y='estimated unemployment rate',color='area',title='Average unemployment rate (area)')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()
In [28]:
# Bar plot Unemployment Rate (monthly)
fig = px.bar(df,x='area',y='estimated unemployment rate',animation_frame='month',color='area',title='Unemployment rate from Jan 2020 to Oct 2020(State)')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()
In [29]:
# Filter data before and during lockdown
before_lockdown = df[df['Date'] < '2020-03-25']
In [30]:
during_lockdown = df[df['Date'] >= '2020-03-25']
# Average Unemployment Rate before and during lockdown
avg_unemployment_before = before_lockdown['estimated unemployment rate'].mean()
avg_unemployment_during = during_lockdown['estimated unemployment rate'].mean()
print(f"Average Unemployment Rate before lockdown: {avg_unemployment_before:.2f}%")
print(f"Average Unemployment Rate during lockdown: {avg_unemployment_during:.2f}%")
# Percentage change in Unemployment Rate
percentage_change = ((avg_unemployment_during - avg_unemployment_before) /avg_unemployment_before) * 100
print(f"Percentage Change in Unemployment Rate: {percentage_change:.2f}%")
Average Unemployment Rate before lockdown: 9.51% Average Unemployment Rate during lockdown: 17.77% Percentage Change in Unemployment Rate: 86.91%
In [31]:
fig=px.scatter_geo(df,'region',color='region',hover_name='region',size='estimated unemployment rate',animation_frame='month',scope='asia',title='Impact of lockdown on employment in India')
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] =2000
fig.update_geos(lataxis_range=[5,40],lonaxis_range=[65,100],oceancolor='lightblue',showocean=True)
fig.show()
In [32]:
df.region.unique()
Out[32]:
array(['Andhra Pradesh', 'Assam', 'Bihar', 'Chhattisgarh', 'Delhi', 'Goa',
'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu & Kashmir',
'Jharkhand', 'Karnataka', 'Kerala', 'Madhya Pradesh',
'Maharashtra', 'Meghalaya', 'Odisha', 'Puducherry', 'Punjab',
'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura',
'Uttar Pradesh', 'Uttarakhand', 'West Bengal', 'Chandigarh'],
dtype=object)
In [33]:
# numeric data grouped by region
region = df.groupby(['region'])[['estimated unemployment rate','estimated employed','estimated labour participation rate']].mean()
region = pd.DataFrame(region).reset_index()
In [34]:
import plotly.express as px
# Specify dimensions and color parameter for the scatter matrix plot
dimensions = ['estimated unemployment rate', 'estimated employed', 'estimated labour participation rate']
color_column = 'region'
# Create scatter matrix plot with Plotly Express
fig = px.scatter_matrix(df,dimensions=dimensions,color=color_column,title='Scatter Matrix Plot Colored by Region')
# Display the plot
fig.show()
In [47]:
import dash
import dash_core_components as dcc
import dash_html_components as html
app = dash.Dash(__name__)
# Define layout
app.layout = html.Div([
dcc.Graph(id='unemployment-trend', figure={'data': [{'x': df['Date'], 'y': df['estimated unemployment rate'],'type': 'line', 'name': 'Unemployment Rate'}],'layout': {'title': 'Unemployment Rate Over Time'}})])
if __name__ == '__main__':
app.run_server(debug=True)
In [36]:
# Average Unemployment Rate
fig = px.bar(region,x='region',y='estimated unemployment rate',color='region',title='Average unemployment rate(region)')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()
In [37]:
fig = px.bar(df,x='region',y='estimated unemployment rate',animation_frame='month',color='area',title='Unemployment rate from Jan 2020 to Oct 2020')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] =2000
fig.show()
In [38]:
unemployment =df.groupby(['region','area'])['estimated unemployment rate'].mean().reset_index()
unemployment.head()
Out[38]:
| region | area | estimated unemployment rate | |
|---|---|---|---|
| 0 | Andhra Pradesh | Rural | 5.526429 |
| 1 | Andhra Pradesh | Urban | 9.427857 |
| 2 | Assam | Rural | 4.490833 |
| 3 | Assam | Urban | 8.088571 |
| 4 | Bihar | Rural | 16.770000 |
In [39]:
fig = px.sunburst(unemployment,path=['region','area'],values='estimated unemployment rate',title ='Unemployment rate in area and region',height=600)
fig.show()
In [40]:
# data representation before and after lockdown
before_lockdown = df[(df['month_int']>=1) &(df['month_int'] <4)]
after_lockdown = df[(df['month_int'] >=4) & (df['month_int'] <=6)]
In [48]:
af_lockdown = after_lockdown.groupby('area')['estimated unemployment rate'].mean().reset_index()
lockdown = before_lockdown.groupby('area')['estimated unemployment rate'].mean().reset_index()
lockdown['unemployment rate before lockdown'] = af_lockdown['estimated unemployment rate']
In [42]:
lockdown.columns = ['area','unemployment rate before lockdown','unemployment rate after lockdown']
lockdown.head()
Out[42]:
| area | unemployment rate before lockdown | unemployment rate after lockdown | |
|---|---|---|---|
| 0 | Rural | 8.735132 | 13.909843 |
| 1 | Urban | 11.561951 | 17.177293 |
In [43]:
# unenployment rate change after lockdown
lockdown['rate change in unemployment'] =round(lockdown['unemployment rate before lockdown']-lockdown['unemployment rate before lockdown']
/lockdown['unemployment rate after lockdown'],2)
In [44]:
fig = px.bar(lockdown,x='area',y='rate change in unemployment',color='rate change in unemployment',title='Percentage change in Unemployment rate in each state after lockdown',template='ggplot2')
fig.update_layout(xaxis={'categoryorder':'total ascending'})
fig.show()
In [45]:
# Calculate correlation matrix
correlation_matrix = df[['estimated unemployment rate', 'estimated employed','estimated labour participation rate']].corr()
# Plot correlation heatmap
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()
In [46]:
from statsmodels.tsa.arima.model import ARIMA
# Fit ARIMA model
model = ARIMA(df['estimated unemployment rate'], order=(1, 1, 1))
model_fit = model.fit()
# Forecast future unemployment rates
forecast = model_fit.forecast(steps=12)
print(forecast)
C:\Users\yrath\AppData\Local\Programs\Python\Python312\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting. C:\Users\yrath\AppData\Local\Programs\Python\Python312\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting. C:\Users\yrath\AppData\Local\Programs\Python\Python312\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
740 10.456696 741 10.723949 742 10.843648 743 10.897259 744 10.921271 745 10.932025 746 10.936842 747 10.939000 748 10.939966 749 10.940399 750 10.940593 751 10.940679 Name: predicted_mean, dtype: float64
C:\Users\yrath\AppData\Local\Programs\Python\Python312\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`.
In [ ]: